{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have already seen how the RAG is implemented in data.\n",
    "In this note, we will focus on making each component more configurable,\n",
    "especially the data processing pipeline, so that it is easier to run experiments; we will see how useful this configurability is in benchmarking."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the data pipeline and the backend data processing\n",
    "from adalflow.core.embedder import Embedder\n",
    "from adalflow.core.types import ModelClientType\n",
    "from adalflow.components.data_process import TextSplitter, ToEmbeddings\n",
    "from adalflow.core.container import Sequential\n",
    "\n",
    "\n",
    "def prepare_data_pipeline():\n",
    "    \"\"\"Build the data-processing pipeline: a text splitter chained to an embedder.\n",
    "\n",
    "    The splitter is configured with ``split_by=\"word\"``, ``split_length=50`` and\n",
    "    ``split_overlap=10``. The embedder uses the OpenAI model client with the\n",
    "    ``text-embedding-3-small`` model at 256 dimensions and is wrapped in\n",
    "    ``ToEmbeddings`` with ``batch_size=2``.\n",
    "\n",
    "    The assembled pipeline is printed so its structure is visible in the cell\n",
    "    output.\n",
    "\n",
    "    Returns:\n",
    "        Sequential: the splitter followed by the embedding transformer.\n",
    "    \"\"\"\n",
    "    model_kwargs = {\n",
    "        \"model\": \"text-embedding-3-small\",\n",
    "        \"dimensions\": 256,\n",
    "        \"encoding_format\": \"float\",\n",
    "    }\n",
    "\n",
    "    splitter_config = {\"split_by\": \"word\", \"split_length\": 50, \"split_overlap\": 10}\n",
    "\n",
    "    splitter = TextSplitter(**splitter_config)\n",
    "    embedder = Embedder(\n",
    "        model_client=ModelClientType.OPENAI(), model_kwargs=model_kwargs\n",
    "    )\n",
    "    embedder_transformer = ToEmbeddings(embedder, batch_size=2)\n",
    "    data_transformer = Sequential(splitter, embedder_transformer)\n",
    "    print(data_transformer)\n",
    "    # Return the pipeline so callers can use it; without this the object\n",
    "    # is built, printed, and then discarded.\n",
    "    return data_transformer"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lightrag-project",
   "language": "python",
   "name": "light-rag-project"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
